Introduction

  • Jeffrey D. Blume, PhD
    • School of Data Science, University of Virginia
  • Megan H. Murray, PhD
    • Project Statistician, Eli Lilly and Company

Resources:

1 SGPV R Package

There are two ways to install the sgpv package.

  • CRAN: install.packages("sgpv")
  • GitHub: devtools::install_github("weltybiostat/sgpv")
# Two equivalent ways to install the sgpv package: the development
# version from GitHub (requires devtools) or the release from CRAN.
#GitHub
install.packages("devtools")
devtools::install_github("weltybiostat/sgpv")

#CRAN
install.packages("sgpv")

Functions Included:

  • sgpvalue()
    • This function computes the second-generation p-value (SGPV) and its associated delta gaps, as introduced in Blume et al. (2018).
  • sgpower()
    • Calculate power and type I error values from significance testing based on second-generation p-values as the inferential metric.
  • plotsgpv()
    • This function displays user supplied interval estimates (support intervals, confidence intervals, credible intervals, etc.) according to its associated second-generation p-value ranking.
  • plotman()
    • This function displays a modified Manhattan-style plot colored according to second-generation p-value status.
  • fdrisk()
    • This function computes the false discovery risk (sometimes called the “empirical Bayes FDR”) for a second-generation p-value of 0, or the false confirmation risk for a second-generation p-value of 1.

2 Part 2

2.1 Statistical Properties

3 inference outcomes:

  • Data are consistent with null \(p_\delta=1\)
  • Data are consistent with alternative \(p_\delta=0\)
  • Data are inconclusive \(0<p_\delta<1\)

2 underlying truths:

  • Null is true
  • Alternative is true

2.1.1 CIs can miss (bummer)

## cidemo: simulate `rep` repeated binomial experiments and draw their
## Wald confidence intervals, highlighting intervals that miss the true
## probability and counting intervals that miss the indifference zone.
##
## Arguments:
##   rep     number of replicated experiments (one CI per replication)
##   mu      true success probability; data ~ Bernoulli(mu)
##   n       sample size per replication
##   alpha   two-sided CI significance level
##   ylb     y-axis label for the probability-scale plot
##   ltype, lcol           line type / colour for ordinary intervals
##   ltype1, lwd1, lcol1   style for intervals that miss the true value
##   plotset if TRUE, auto-scale y-limits to mu +/- 5 standard errors
##   hilim, lolim          manual y-limits (used when plotset is FALSE)
##   bcol    fill colour of the indifference-zone band
##   xleg, yleg            legend position
##   idz.lo, idz.hi        lower / upper edge of the indifference zone
##   zone    if TRUE, shade the zone and report zone misses in the legend
##
## Invisibly returns a list: missed.truth (count of CIs excluding mu),
## missed.zone (count of CIs entirely outside the indifference zone),
## and miss.idz (analytic miss probability).
cidemo = function(rep=100,
                  mu=0.8,
                  n=200,
                  alpha=0.05,
                  ylb="Probability",
                  ltype=1,
                  lcol=4,
                  ltype1=1,
                  lwd1=2,
                  lcol1=2,
                  plotset=FALSE,
                  hilim=1,
                  lolim=0.5,
                  bcol="lemonchiffon2",
                  xleg=45,
                  yleg=0.65,
                  idz.lo=0.75,
                  idz.hi=0.85,
                  zone=TRUE){

  ## Standard normal quantile and (known) Bernoulli SD
  za <- abs(qnorm(alpha / 2))
  sigma <- sqrt(mu * (1 - mu))

  ## Simulate all replications at once; column i holds replication i, so
  ## the RNG stream matches a draw-per-replication loop exactly.
  xbar <- colMeans(matrix(rbinom(n * rep, 1, prob = mu), nrow = n, ncol = rep))

  ## Row 1: replication id; row 2: upper CI limit; row 3: lower CI limit.
  cints <- rbind(seq_len(rep),
                 xbar + za * sigma / sqrt(n),
                 xbar - za * sigma / sqrt(n))

  if (isTRUE(plotset)) {
    ## Auto-scale the y-axis to +/- 5 standard errors around mu.
    lolim <- mu - 5 * sigma / sqrt(n)
    hilim <- mu + 5 * sigma / sqrt(n)
  }

  ## NOTE: the titles hard-code "95%" regardless of alpha.
  if (sigma != 1) {
    plot(cints[1,],cints[2,],ylim=c(lolim,hilim),type="n",
    main=paste("95% Confidence Intervals for the Probability\n Sample Size of ",n," per Endpoint"),
    ylab=ylb,xlab="Replication")
  }

  if (sigma == 1) {
    plot(cints[1,],cints[2,],ylim=c(lolim,hilim),type="n",
    main=paste("95% Confidence Intervals on the Standardized Effect Size \n Sample Size of ",n," per Endpoint"),
    ylab="Standardized Effect Size (X/sigma units)",xlab="Endpoint Identification Number")
  }

  if (isTRUE(zone)) {
    ## Shade the indifference zone band.
    polygon(c(0,rep,rep,0),c(idz.hi,idz.hi,idz.lo,idz.lo),density=200,col=bcol)
  }

  abline(h=mu)
  points(cints[1,],cints[2,],cex=0.6,pch=16,col=lcol)
  points(cints[1,],cints[3,],cex=0.6,pch=16,col=lcol)
  segments(cints[1,], cints[2,], cints[1,], cints[3,],lty=ltype,col=lcol)

  ## Intervals that exclude the true mu; redraw them highlighted.
  missed <- cints[2, ] < mu | cints[3, ] > mu
  if (any(missed)) {
    segments(cints[1, missed], cints[2, missed],
             cints[1, missed], cints[3, missed],
             lty = ltype1, col = lcol1, lwd = lwd1)
    points(cints[1, missed], cints[2, missed], cex = 0.6, pch = 16, col = lcol1)
    points(cints[1, missed], cints[3, missed], cex = 0.6, pch = 16, col = lcol1)
  }
  ct <- sum(missed)

  ## Intervals lying entirely below or entirely above the zone.
  ct2 <- sum(cints[2, ] < idz.lo | cints[3, ] > idz.hi)

  if (rep > 100) {
    abline(h=idz.hi,col=bcol,lty=2,lwd=1.5)
    abline(h=idz.lo,col=bcol,lty=2,lwd=1.5)
  }

  ## Analytic probability of missing the indifference zone:
  ## P(CI upper limit < idz.lo) + P(CI lower limit > idz.hi).
  ## NOTE(review): this expression is not centred at mu -- verify it
  ## against the intended derivation before relying on it.
  miss.idz <- pnorm(-za + idz.lo * sqrt(n) / sigma) +
    pnorm(-za - idz.hi * sqrt(n) / sigma)

  if (isTRUE(zone)) {
    legend(xleg,yleg,bty="n",
    c(paste("Missed True Value:             ",ct," / ",rep,sep=""),
    paste("Missed Indifference Zone:   ",ct2," / ",rep,sep="")))
  } else {
    legend(xleg,yleg,bty="n",
    c(paste("Missed True Value:             ",
            ct," / ",rep,sep="")))
  }

  invisible(list(missed.truth = ct, missed.zone = ct2, miss.idz = miss.idz))
}

# Fix the RNG seed so the simulated confidence intervals are reproducible.
set.seed(999)

# Without the indifference zone (zone = FALSE): small (n = 75) and large
# (n = 400) samples, two replications of each figure.
cidemo(n=75,zone=FALSE)

cidemo(n=75,zone=FALSE)

cidemo(n=400,zone=FALSE)

cidemo(n=400,zone=FALSE)

# Same sample sizes, now shading the indifference zone (default zone = TRUE).
cidemo(n=75)

cidemo(n=75)

cidemo(n=400)

cidemo(n=400)

## prob.alt.sg: probability that the SGPV equals 0, i.e. that the
## (1 - alpha) confidence interval for the mean lands entirely outside
## the null zone theta0 +/- null.delta, when the true mean is theta.
##
## Arguments:
##   n          sample size(s); vectorized
##   null.delta half-width of the null (indifference) zone
##   theta0     centre of the null zone
##   theta      true mean at which the probability is evaluated
##   V          variance of a single observation
##   alpha      CI significance level
##
## Returns a numeric vector (length follows the usual recycling rules).
prob.alt.sg <- function(n, null.delta,
                        theta0 = 0,
                        theta = 0,
                        V = 1,
                        alpha = 0.05) {
  z.crit <- qnorm(1 - alpha / 2)

  ## P(CI entirely below the zone) + P(CI entirely above the zone)
  pnorm(sqrt(n) * (theta0 - null.delta) / sqrt(V) - sqrt(n) * theta / sqrt(V) - z.crit) +
    pnorm(-1 * sqrt(n) * (theta0 + null.delta) / sqrt(V) + sqrt(n) * theta / sqrt(V) - z.crit)
}

Errors and Plots

## prob.null.sg: probability that the SGPV equals 1, i.e. that the
## (1 - alpha) confidence interval lands entirely inside the null zone
## theta0 +/- null.delta, when the true mean is theta. This is possible
## only when the zone half-width exceeds the CI half-width; otherwise
## the probability is exactly 0.
##
## Arguments mirror prob.alt.sg; n and null.delta may be vectors of the
## same length (elementwise evaluation), or null.delta may be scalar.
##
## Fix: the theta0 term is now scaled by sqrt(V), consistent with every
## other term (previously unscaled; harmless at the default V = 1).
prob.null.sg <- function(n,
                         null.delta,
                         theta0 = 0,
                         theta = 0,
                         V = 1,
                         alpha = 0.05) {
  z.crit <- qnorm(1 - alpha / 2)
  ci.half <- z.crit * sqrt(V) / sqrt(n)   # CI half-width at each n

  if (length(null.delta) > 1) {
    results <- rep(NA, length(n))
    wide <- which(null.delta > ci.half)    # zone wider than the CI
    narrow <- which(null.delta <= ci.half) # CI cannot fit inside zone
    results[narrow] <- 0
    results[wide] <-
      pnorm(sqrt(n[wide]) * theta0 / sqrt(V) + sqrt(n[wide]) * null.delta[wide] / sqrt(V) - sqrt(n[wide]) * theta / sqrt(V) - z.crit) -
      pnorm(sqrt(n[wide]) * theta0 / sqrt(V) - sqrt(n[wide]) * null.delta[wide] / sqrt(V) - sqrt(n[wide]) * theta / sqrt(V) + z.crit)
  } else if (null.delta > ci.half) {
    results <-
      pnorm(sqrt(n) * theta0 / sqrt(V) + sqrt(n) * null.delta / sqrt(V) - sqrt(n) * theta / sqrt(V) - z.crit) -
      pnorm(sqrt(n) * theta0 / sqrt(V) - sqrt(n) * null.delta / sqrt(V) - sqrt(n) * theta / sqrt(V) + z.crit)
  } else {
    results <- rep(0, length(theta))
  }
  results
}
## prob.incon.sg: probability that the SGPV is strictly between 0 and 1
## (the interval overlaps the null zone without being contained in it),
## i.e. the complement of the two decisive outcomes.
##
## Arguments mirror prob.alt.sg / prob.null.sg (defined above).
## Removed: a which()-based partition of null.delta that was computed
## but never used, and a dead NA preallocation.
prob.incon.sg <- function(n, null.delta,
                          theta0 = 0,
                          theta = 0,
                          V = 1,
                          alpha = 0.05) {
  1 - (prob.alt.sg(n, null.delta, theta0, theta, V, alpha) +
         prob.null.sg(n, null.delta, theta0, theta, V, alpha))
}
# Grid of true means (alternatives) over which the three SGPV outcome
# probabilities are plotted, for null-zone half-widths 0, 0.5 and 1 at
# n = 50. prob.alt.sg / prob.null.sg / prob.incon.sg are defined above.
theta.1 = seq(-3,3,by=0.01)

# P(SGPV = 0): probability of being consistent with the alternative.
plot(theta.1, prob.alt.sg(n=50,
            null.delta = 0,
            theta=theta.1), type="l",
     ylim=c(0,1),
     xlab="Alternative",
     ylab="Probability",
     main="Prob of being consistent with alternative")
lines(theta.1, prob.alt.sg(n=50,
            null.delta = 0.5,
            theta=theta.1),
      col="green")
lines(theta.1, prob.alt.sg(n=50,
            null.delta = 1,
            theta=theta.1),
      col="blue")
abline(h=0.05, lty=2)   # reference line at the nominal 5% level
legend("bottomright",
       legend=c("Delta=0", "Delta=0.5", "Delta=1"),
       col=c("black", "green","blue"), lty=1)

theta.1 = seq(-3,3,by=0.01)

# P(SGPV = 1): probability of being consistent with the null; this is
# identically 0 for the point null (delta = 0).
plot(theta.1, prob.null.sg(n=50,
            null.delta = 0,
            theta=theta.1), type="l",
     ylim=c(0,1),
     xlab="Alternative",
     ylab="Probability",
     main="Prob of being consistent with null")
lines(theta.1, prob.null.sg(n=50,
            null.delta = 0.5,
            theta=theta.1),
      col="green")
lines(theta.1, prob.null.sg(n=50,
            null.delta = 1,
            theta=theta.1),
      col="blue")
abline(h=0.05, lty=2)
legend("bottomright",
       legend=c("Delta=0", "Delta=0.5", "Delta=1"),
       col=c("black", "green","blue"), lty=1)

theta.1 = seq(-3,3,by=0.01)

# P(0 < SGPV < 1): probability of an inconclusive result, the
# complement of the two curves above.
plot(theta.1, prob.incon.sg(n=50,
            null.delta = 0,
            theta=theta.1), type="l",
     ylim=c(0,1),
     xlab="Alternative",
     ylab="Probability",
     main="Prob of being inconclusive")
lines(theta.1, prob.incon.sg(n=50,
            null.delta = 0.5,
            theta=theta.1),
      col="green")
lines(theta.1, prob.incon.sg(n=50,
            null.delta = 1,
            theta=theta.1),
      col="blue")
abline(h=0.05, lty=2)
legend("bottomright",
       legend=c("Delta=0", "Delta=0.5", "Delta=1"),
       col=c("black", "green","blue"), lty=1)

2.2 SGPV False Discovery Risk

  • False Discovery Rate (FDR)
    • SGPV = 0
  • False Confirmation Rate (FCR)
    • SGPV = 1
  • sgpv::fdrisk()
    • This function computes the false discovery risk (sometimes called the “empirical Bayes FDR”) for a second-generation p-value of 0, or the false confirmation risk for a second-generation p-value of 1.
# Worked fdrisk() examples; the function is from the sgpv package
# (assumed loaded or called via sgpv::fdrisk). Lines starting "## [1]"
# echo the values produced by each call.
###### FDR rates
# false discovery risk with 95% confidence level
fdrisk(sgpval = 0,  
       null.lo = log(1/1.1), 
       null.hi = log(1.1),  
       std.err = 0.8,  
       null.weights = 'Uniform',  
       null.space = c(log(1/1.1), log(1.1)),  
       alt.weights = 'Uniform',  
       alt.space = 2 + c(-1,1)*qnorm(1-0.05/2)*0.8,  
       interval.type = 'confidence',  
       interval.level = 0.05)
## [1] 0.05949861
## with truncated normal weighting distribution
fdrisk(sgpval = 0,  
       null.lo = log(1/1.1), 
       null.hi = log(1.1),  
       std.err = 0.8,  
       null.weights = 'Point', 
       null.space = 0,  
       alt.weights = 'TruncNormal',  
       alt.space = 2 + c(-1,1)*qnorm(1-0.041/2)*0.8,  
       interval.type = 'likelihood',  interval.level = 1/8)
## [1] 0.04902575
# false discovery risk with LSI and wider null hypothesis
fdrisk(sgpval = 0,  
       null.lo = log(1/1.5), 
       null.hi = log(1.5), 
       std.err = 0.8,  
       null.weights = 'Point', 
       null.space = 0,  
       alt.weights = 'Uniform', 
       alt.space = 2.5 + c(-1,1)*qnorm(1-0.041/2)*0.8,  
       interval.type = 'likelihood',  
       interval.level = 1/8)
## [1] 0.01688343
# false confirmation risk example
fdrisk(sgpval = 1,  
       null.lo = log(1/1.5), 
       null.hi = log(1.5), 
       std.err = 0.15,  
       null.weights = 'Uniform',  
       null.space = 0.01 + c(-1,1)*qnorm(1-0.041/2)*0.15, 
       alt.weights = 'Uniform', 
       alt.space = c(log(1.5), 1.25*log(1.5)), 
       interval.type = 'likelihood',  
       interval.level = 1/8)
## [1] 0.03059522
###
##
#

False Discovery and Confirmation Rates

## power.pv: operating characteristics of the SGPV procedure for a
## normal mean over a grid of alternatives mu in [mu.l, mu.u].
##
## Arguments:
##   n      sample size
##   sd     data standard deviation; data ~ N(mu, sd)
##   alpha  two-sided significance level
##   mu.l, mu.u  range of the alternative grid (step 0.01)
##   delta  half-width of the indifference zone mu.0 +/- delta
##   mu.0   centre of the indifference zone
##   r      prior odds P(H1)/P(H0) for the risk formulas
##
## Returns a matrix with one row per grid point and columns
## 'mu','alpha.d','power.d','fdr.d','fndr.d','alpha.0','power.0',
## 'fdr.0','fndr.0','gam.0','gam.a'.
power.pv <- function(n = 20, sd = 1, alpha = 0.05, mu.l = -3, mu.u = 3,
                     delta = 0.3, mu.0 = 0, r = 1) {

  mu <- seq(mu.l, mu.u, 0.01)      # grid of alternatives
  z.crit <- qnorm(1 - alpha / 2)   # two-sided critical value
  shift <- mu.0 - mu               # signed distance from the null centre

  ## P(SGPV = 0 | true mean mu) for null-zone half-width d;
  ## d = 0 recovers the ordinary two-sided test.
  reject.prob <- function(d) {
    pnorm(-sqrt(n) * (d - shift) / sd - z.crit) +
      pnorm(-sqrt(n) * (d + shift) / sd - z.crit)
  }

  power.d <- reject.prob(delta)  # power with the indifference zone
  power.0 <- reject.prob(0)      # power of the classical test

  ## Type I error at mu = mu.0, with and without the zone.
  alpha.d <- 2 * pnorm(-sqrt(n) * (delta) / sd - z.crit)
  alpha.0 <- 2 * pnorm(-sqrt(n) * (0) / sd - z.crit)

  ## P(SGPV = 1): only possible when the zone is wider than the CI
  ## half-width, hence the indicator factor.
  zone.wide <- (delta > z.crit * sd / sqrt(n))
  gam.0 <- (pnorm(sqrt(n) * (delta) / sd - z.crit) -
              pnorm(sqrt(n) * (-delta) / sd + z.crit)) * zone.wide
  gam.a <- (pnorm(sqrt(n) * (delta + mu.0 - mu) / sd - z.crit) -
              pnorm(sqrt(n) * (mu.0 - delta - mu) / sd + z.crit)) * zone.wide

  ## Bayes-style false discovery / non-discovery risks at prior odds r.
  fdr.d <- 1 / (1 + power.d / alpha.d * r)
  fndr.0 <- 1 / (1 + (1 - alpha.0) / (1 - power.0) * (1 / r))

  fdr.0 <- 1 / (1 + power.0 / alpha.0 * r)
  fndr.d <- 1 / (1 + gam.0 / gam.a * (1 / r))

  ret <- cbind(mu, alpha.d, power.d, fdr.d, fndr.d,
               alpha.0, power.0, fdr.0, fndr.0, gam.0, gam.a)
  colnames(ret) <- c('mu', 'alpha.d', 'power.d', 'fdr.d', 'fndr.d',
                     'alpha.0', 'power.0', 'fdr.0', 'fndr.0',
                     'gam.0', 'gam.a')
  ret
}
par(mfrow=c(1,1))
# Prior odds P(H1)/P(H0), indifference-zone half-width, and sample size
# used for the risk curves below.
prev.ratio=1
set.delta=0.5
set.n=16

all=power.pv(delta=set.delta,n=set.n,r=prev.ratio,alpha=0.05)

# Empty frame: risks on [0, r/(r+1)] against the alternative (SD units).
plot(all[,'mu'],all[,'fdr.0'],type="n",ylab="Probability",xlab="Alternative in SD units",
     ylim=c(0,prev.ratio/(prev.ratio+1)),xlim=c(-3,3))
title(main="False discovery and confirmation rates")

# Dashed: classical p-value based rates (delta = 0).
lines(all[,'mu'],all[,'fdr.0'],col="firebrick",lwd=1,lty=2)
lines(all[,'mu'],all[,'fndr.0'],col="dodgerblue",lwd=1,lty=2)

# Solid: SGPV-based rates (delta = set.delta).
lines(all[,'mu'],all[,'fdr.d'],col="firebrick",lwd=2)
lines(all[,'mu'],all[,'fndr.d'],col="dodgerblue",lwd=2)

# Blank legend entries create vertical spacing between the two groups.
legend("topleft",c("",
                   expression(paste("P-value < ",0.05,sep="")),
                   expression(paste(P[delta]," = 0",sep="")),
                   "", "",
                   expression(paste("P-value > ",0.05,sep="")),
                   expression(paste(P[delta]," = 1",sep=""))),
       col=c("","firebrick","firebrick","","",
             "dodgerblue","dodgerblue"),
       lty=c(NA,2,1,NA,NA,2,1),lwd=c(NA,1,2,NA,NA,1,2),bty="n")

text(-2.875,0.5,"FDR")
text(-2.875,0.42,"FCR")

# Tick marks at the zone edges +/- set.delta on the bottom and top axes.
axis(side = 1, at = c(-set.delta,set.delta),cex.axis=0.8,col.ticks="black",lwd.ticks=1.5,tcl=0.6,labels=FALSE)
axis(side = 3, at = c(-set.delta,set.delta),cex.axis=0.8,col.ticks="black",lwd.ticks=1.5,tcl=0.6,labels=FALSE)

text(0,-0.0125,expression(paste(H[0]," Zone",sep="")),cex=0.8)
text(0,prev.ratio/(prev.ratio+1)*1.015,expression(paste(H[0]," Zone",sep="")),cex=0.8)

text(2.5,0.05/(prev.ratio+0.05)+.01,"alpha/(alpha+r)")

2.3 Study Planning

Purpose: Evaluate how different techniques of setting the indifference zone affect the SGPV study properties.

What options does a collaborator have when they are uncertain of the indifference zone?

Fixed Intervals

# Candidate rules for the null-zone half-width delta as a function of n.
# All candidates here are constants; the dashed curve is the CI
# half-width qnorm(1-alpha/2)*sqrt(V/n), below which the SGPV can never
# equal 1.
n= seq(1,250, by=0.1)
V=1
alpha=0.05

null.delta0 = rep(0.5, length(n))
null.delta1 = rep(0, length(n))      # point null
null.delta2 = rep(1.25, length(n))
null.delta3 = rep(1, length(n))
null.delta4 = rep(0.75, length(n))
null.delta5 = rep(0.25, length(n))

# NOTE(review): TeX() is from the latex2exp package, assumed to be
# loaded earlier in the document.
ci.bound = qnorm(1-alpha/2)*sqrt(V)/sqrt(n)
plot(n, null.delta1, 
     type="l",
     col="red",
     lwd=2,
     ylim=c(0,3),
     ylab=TeX(r'(Half Interval Width $ (\delta)$)'),
     xlab = "Sample Size (n)",
     main="")
lines(n, null.delta2,
      col="blue",
      lwd=2)
lines(n, null.delta3,
      col="purple",
      lwd=2)
lines(n, null.delta4,
      col="darkgreen",
      lwd=2)
lines(n, null.delta5,
      col="orange",
      lwd=2)
lines(n, null.delta0,
      col="hotpink",
      lwd=2)
lines(n, ci.bound,
      col="black",
      lty=2,
      lwd=2)
# abline(h=0.05, col="black", lty=2)
legend(130, 
       3,
        col=c("black",
             "red",
             "blue",
             "purple",
             "darkgreen",
             "hotpink",
             "orange"),
       box.lty=0,
       lty=c(2,
             1,
             1,
             1,
             1,
             1,
             1),
       cex=0.65,
       lwd=2,
       legend=c(TeX(r'(Uncertainty Interval $\left(\sqrt{1/n}\right)$)'),
                "Point Null: Constant at 0",
                "SGPV: Constant at 1.25", 
                "SGPV: Constant at 1",
                "SGPV: Constant at 0.75",
                "SGPV: Constant at 0.5",
                "SGPV: Constant at 0.25"))

# Two-panel layout: Type I error (theta = 0) and power (theta = 1) of
# the SGPV rule as functions of sample size, one curve per delta rule.
par(mfrow=c(1,2))
par(mar=c(2,2,2,2))

# Left panel: Type I error; all curves stay at or below 0.05 (dashed).
plot(n, prob.alt.sg(n, null.delta1,
                  theta=0), 
     type="l",
     col="red",
     lwd=2,
     ylim=c(0,0.055),
     ylab="", 
     main=TeX(r'(Type 1 Error $\beta_0$)'))
lines(n, prob.alt.sg(n, null.delta2,
                  theta=0),
      col="blue",
      lwd=2)
lines(n, prob.alt.sg(n, null.delta3,
                  theta=0),
      col="purple",
      lwd=2)
lines(n, prob.alt.sg(n, null.delta4,
                  theta=0),
      col="darkgreen",
      lwd=2)
lines(n, prob.alt.sg(n, null.delta5,
                  theta=0),
      col="orange",
      lwd=2)
lines(n, prob.alt.sg(n, null.delta0,
                  theta=0),
      col="hotpink",
      lwd=2)
abline(h=0.05, col="black", lty=2)

# Right panel: power against the fixed alternative theta = 1.
plot(n, prob.alt.sg(n, null.delta1,
                  theta=1), 
     type="l",
     col="red",
     lwd=2,
     ylim=c(0,1),
     ylab="", 
     main=TeX(r'(Power $\beta_1$)'))
lines(n, prob.alt.sg(n, null.delta2,
                  theta=1),
      col="blue",
      lwd=2)
lines(n, prob.alt.sg(n, null.delta3,
                  theta=1),
      col="purple",
      lwd=2)
lines(n, prob.alt.sg(n, null.delta4,
                  theta=1),
      col="darkgreen",
      lwd=2)
lines(n, prob.alt.sg(n, null.delta5,
                  theta=1),
      col="orange",
      lwd=2)
lines(n, prob.alt.sg(n, null.delta0,
                  theta=1),
      col="hotpink",
      lwd=2)
legend(70, 0.5,
       col=c("red",
             "blue",
             "purple",
             "darkgreen",
             "hotpink",
             "orange"),
       box.lty=0,
       lty=c(1,
             1,
             1,
             1,
             1,
             1),
       cex=0.7,
       lwd=2,
       legend=c("Point Null: Constant at 0",
                "SGPV: Constant at 1.25", 
                "SGPV: Constant at 1",
                "SGPV: Constant at 0.75",
                "SGPV: Constant at 0.5",
                "SGPV: Constant at 0.25"))

par(mfrow=c(1,1))
par(mar=c(4,4,2,2))

# Sum of power at theta = 1 and probability of SGPV = 1 at theta = 0,
# per delta rule.
# NOTE(review): the null.delta3 (purple) series is commented out below,
# yet "purple" and a 6-entry label list remain in the legend, and the
# legend lty vector has 12 entries for 6 labels -- likely leftovers;
# confirm the intended legend contents.
plot(n, prob.null.sg(n, null.delta1,
                  theta=0)+prob.alt.sg(n, null.delta1,
                  theta=1), 
     type="l",
     col="red",
     lwd=2,
     ylim=c(0,2),
     ylab=TeX(r'( $\beta_1$ + $\omega_0$)'), 
     main= TeX(r'(Sum of Power and Probability of Null ($\beta_1$ +$\omega_0$))'))
lines(n, prob.null.sg(n, null.delta2,
                  theta=0)+prob.alt.sg(n, null.delta2,
                  theta=1),
      col="blue",
      lwd=2)
# lines(n, prob.null.sg(n, null.delta3,
#                   theta=0),
#       col="purple",
#       lwd=2)
lines(n, prob.null.sg(n, null.delta4,
                  theta=0)+prob.alt.sg(n, null.delta4,
                  theta=1),
      col="darkgreen",
      lwd=2)
lines(n, prob.null.sg(n, null.delta5,
                  theta=0)+prob.alt.sg(n, null.delta5,
                  theta=1),
      col="orange",
      lwd=2)
lines(n, prob.null.sg(n, null.delta0,
                  theta=0)+prob.alt.sg(n, null.delta0,
                  theta=1),
      col="hotpink",
      lwd=2)
legend(172, 0.7,
       col=c("red",
             "blue",
             "purple",
             "darkgreen",
             "hotpink",
             "orange"),
       box.lty=0,
       lty=c(1,
             1,
             1,
             1,
             1,
             1,
             2,
             2,
             2,
             2,
             2,
             2),
       cex=0.7,
       lwd=2,
       legend=c(TeX(r'( Constant at 0)'),
                TeX(r'(Constant at 1.25)'), 
                 TeX(r'( Constant at 1)'), 
                 TeX(r'( Constant at 0.75)'), 
                 TeX(r'(Constant at 0.5)'), 
                 TeX(r'( Constant at 0.25)')))

# Two-panel layout: P(SGPV = 1) under the null (theta = 0, left) and
# under the alternative (theta = 1, right), per delta rule.
par(mfrow=c(1,2))
par(mar=c(2,2,4,2))

plot(n, prob.null.sg(n, null.delta1,
                  theta=0), 
     type="l",
     col="red",
     lwd=2,
     ylim=c(0,1),
     ylab="", 
     main= TeX(r'(Probability of Null $\omega_0$)'))
lines(n, prob.null.sg(n, null.delta2,
                  theta=0),
      col="blue",
      lwd=2)
lines(n, prob.null.sg(n, null.delta3,
                  theta=0),
      col="purple",
      lwd=2)
lines(n, prob.null.sg(n, null.delta4,
                  theta=0),
      col="darkgreen",
      lwd=2)
lines(n, prob.null.sg(n, null.delta5,
                  theta=0),
      col="orange",
      lwd=2)
lines(n, prob.null.sg(n, null.delta0,
                  theta=0),
      col="hotpink",
      lwd=2)

plot(n, prob.null.sg(n, null.delta1,
                  theta=1), 
     type="l",
     col="red",
     lwd=2,
     ylim=c(0,1),
     ylab="", 
     main=TeX(r'(Probability of Null $\omega_1$)'))
lines(n, prob.null.sg(n, null.delta2,
                  theta=1),
      col="blue",
      lwd=2)
lines(n, prob.null.sg(n, null.delta3,
                  theta=1),
      col="purple",
      lwd=2)
lines(n, prob.null.sg(n, null.delta4,
                  theta=1),
      col="darkgreen",
      lwd=2)
lines(n, prob.null.sg(n, null.delta5,
                  theta=1),
      col="orange",
      lwd=2)
lines(n, prob.null.sg(n, null.delta0,
                  theta=1),
      col="hotpink",
      lwd=2)
legend(70, 0.5,
       col=c("red",
             "blue",
             "purple",
             "darkgreen",
             "hotpink",
             "orange"),
       box.lty=0,
       lty=c(1,
             1,
             1,
             1,
             1,
             1),
       cex=0.7,
       lwd=2,
       legend=c("Point Null: Constant at 0",
                "SGPV: Constant at 1.25", 
                "SGPV: Constant at 1",
                "SGPV: Constant at 0.75",
                "SGPV: Constant at 0.5",
                "SGPV: Constant at 0.25"))

# Two-panel layout: probability of an inconclusive SGPV under the null
# (theta = 0, left) and under the alternative (theta = 1, right).
par(mfrow=c(1,2))
par(mar=c(2,2,4,2))

plot(n,prob.incon.sg(n, null.delta1,
                  theta=0), 
     type="l",
     col="red",
     lwd=2,
     ylim=c(0,1),
     ylab="", 
     main=TeX(r'(Probability of Inconclusive $\gamma_0$)'))
lines(n,prob.incon.sg(n, null.delta2,
                  theta=0),
      col="blue",
      lwd=2)
lines(n,prob.incon.sg(n, null.delta3,
                  theta=0),
      col="purple",
      lwd=2)
lines(n,prob.incon.sg(n, null.delta4,
                  theta=0),
      col="darkgreen",
      lwd=2)
lines(n,prob.incon.sg(n, null.delta5,
                  theta=0),
      col="orange",
      lwd=2)
lines(n,prob.incon.sg(n, null.delta0,
                  theta=0),
      col="hotpink",
      lwd=2)

plot(n,prob.incon.sg(n, null.delta1,
                  theta=1), 
     type="l",
     col="red",
     lwd=2,
     ylim=c(0,1),
     ylab="", 
     main=TeX(r'(Probability of Inconclusive $\gamma_1$)'))
lines(n,prob.incon.sg(n, null.delta2,
                  theta=1),
      col="blue",
      lwd=2)
lines(n,prob.incon.sg(n, null.delta3,
                  theta=1),
      col="purple",
      lwd=2)
lines(n,prob.incon.sg(n, null.delta4,
                  theta=1),
      col="darkgreen",
      lwd=2)
lines(n,prob.incon.sg(n, null.delta5,
                  theta=1),
      col="orange",
      lwd=2)
lines(n,prob.incon.sg(n, null.delta0,
                  theta=1),
      col="hotpink",
      lwd=2)
legend(90, 0.8,
       col=c("red",
             "blue",
             "purple",
             "darkgreen",
             "hotpink",
             "orange"),
       box.lty=0,
       lty=c(1,
             1,
             1,
             1,
             1,
             1),
       cex=0.7,
       lwd=2,
       legend=c("Point Null: Constant at 0",
                "SGPV: Constant at 1.25", 
                "SGPV: Constant at 1",
                "SGPV: Constant at 0.75",
                "SGPV: Constant at 0.5",
                "SGPV: Constant at 0.25"))

Shrinking from 0.9 to 0.45

Let’s see how these look on a plot:

# Picture of shrinking/constant delta rules against the two hypotheses.
# NOTE(review): `k` (used in the ablines and polygons below) is not
# defined in this chunk -- presumably set earlier in the document (the
# section title suggests 0.9); confirm before running stand-alone.
n2= seq(-10,300, by=0.1)
  
plot(n, ci.bound, 
     type="n",
     col="black",
     lty=2,
     lwd=2,
     ylim=c(-2,2.2),
     xlim=c(0,225),
     ylab="",
     xlab = "Sample Size (n)",
     main="",
     yaxt = "n")
# lines(n, -1*ci.bound,
#       col="black",
#       lty=2,
#       lwd=2)
lines(n, null.delta2,
      col="blue",
      lwd=2)
lines(n, -1*null.delta2,
      col="blue",
      lwd=2)
lines(n, null.delta4,
      col="hotpink",
      lwd=2)
lines(n, -1*null.delta4,
      col="hotpink",
      lwd=2)
lines(n, null.delta5,
      col="orange",
      lwd=2)
lines(n, -1*null.delta5,
      col="orange",
      lwd=2)
abline(h=k, col="black", lty=1,lwd=2)
abline(h=-1*k, col="black", lty=1,lwd=2)
abline(h=0, col="red", lty=1,lwd=2)
abline(h=1, col="red", lty=2,lwd=2)
# Shade the bands between +/-k and +/-k/2.
polygon(c(n2, rev(n2)), c(rep(-1*k, length(n2)), rev(rep(-1*(k/2), length(n2)))),
        border = NA,
        col = rgb(210/633,210/633,210/633, alpha = 0.2))
polygon(c(n2, rev(n2)), c(rep((k/2), length(n2)), rev(rep(k, length(n2)))),
        border = NA,
        col = rgb(210/633,210/633,210/633, alpha = 0.2))
axis(2, at = c(-2,-1,-0.9,-0.45,0,
               2,1,0.9,0.45),
     labels = c(-2,-1,-0.9,-0.45,TeX(r'($\theta_0=0$)'),
               2,TeX(r'($\theta_1=1$)'),0.9,0.45),
     las=1)
# NOTE(review): the TeX strings below read "$delta_i(n)...$" -- likely a
# missing backslash ("\delta"); also they describe n^{-1/4} etc. while
# null.delta2/4/5 above are constants. Verify which is intended.
legend(135, 
       2.25,
       col=c("red",
             "red",
             "black",
             "blue",
             "hotpink",
             "orange"),
       box.lty=0,
       lty=c(1,
             2,
             1,
             1,
             1,
             1),
       cex=0.9,
       lwd=2,
       legend=c(TeX(r'(Null Hypothesis $\theta_0=0$)'),
                TeX(r'(Alternative Hypothesis $\theta_1=1$)'),
                "Original SGPV: Constant at 0.9", 
                TeX(r'($delta_1(n)\propto n^{-1/4}$)'),
                TeX(r'($delta_2(n)\propto n^{-1/3}$)'),
                 TeX(r'($delta_3(n)\propto n^{-1/2}$)')))

2.4 Equivalence tests: TOST

  • Establish bioequivalence between data and an established range
  • Example: A pharmaceutical company tests for drug approval by comparing new drug’s performance to an approved drug’s performance
  • Tests are ordinary, one-sided, α-level t-tests
  • Flips the null and alternative hypotheses
# Single simulated dataset comparing TOST equivalence testing with the
# SGPV. tsum_TOST() is from the TOSTER package and sgpvalue() from the
# sgpv package (both assumed loaded earlier in the document).
n=6
dat = rnorm(n, 0, 1)
# 95% t-interval limits for the mean
a = t.test(dat)$conf.int[1]
b =  t.test(dat)$conf.int[2]
  
x_bar = mean(dat)
S = sd(dat)
Z = (b-x_bar)/(S/sqrt(n))
  
# Equivalence bounds: +/- 0.375 on the raw scale
theta_p = 0.5*0.75
theta_m = -0.5*0.75
  
out_TOST = tsum_TOST(m1=x_bar, 
            mu=0,       
            sd1=S, 
            n1=n, 
            low_eqbound=theta_m, 
            high_eqbound=theta_p, 
            alpha=0.05,
            eqbound_type = "raw")

out_TOST$TOST
##                     t        SE df   p.value
## t-test      0.1645490 0.4629848  5 0.8757443
## TOST Lower  0.9745108 0.4629848  5 0.1872798
## TOST Upper -0.6454127 0.4629848  5 0.2735524
# The TOST p-value is the larger of the two one-sided p-values.
max(out_TOST$TOST$p.value[2], out_TOST$TOST$p.value[3])
## [1] 0.2735524
# SGPV for the same interval estimate against the same null zone.
out_sgpv = sgpvalue(est.lo=a, 
                       est.hi=b, 
                       null.lo=theta_m, 
                       null.hi=theta_p)

out_sgpv 
## $p.delta
## [1] 0.5
## 
## $delta.gap
## [1] NA
# Simulation comparing the SGPV with the TOST equivalence p-value over
# `iter` replications, classifying each replication by how the CI
# overlaps the equivalence (null) zone [theta_m, theta_p].
set.seed(999)
iter=500
n=6
results = as.data.frame(matrix(ncol=3, nrow=iter))
colnames(results) = c("sgpv", "tost", "Case")
half.case = NULL
half.case2 = NULL

for(i in 1:iter){
  dat = rnorm(n, 0, 1)
  # 95% t-interval limits for this replication
  a = t.test(dat)$conf.int[1]
  b =  t.test(dat)$conf.int[2]
  x_bar = mean(dat)
  S = sd(dat)
  Z = (b-x_bar)/(S/sqrt(n))
  
  theta_p = 0.5*0.75
  theta_m = -0.5*0.75
  
  # TOST from summary statistics (TOSTER package, assumed loaded)
  out_TOST = tsum_TOST(m1=x_bar, 
            mu=0,       
            sd1=S, 
            n1=n, 
            low_eqbound=theta_m, 
            high_eqbound=theta_p, 
            alpha=0.05,
            eqbound_type = "raw")
  
  # SGPV for the same interval (sgpv package, assumed loaded)
  out_sgpv = sgpvalue(est.lo=a, 
                       est.hi=b, 
                       null.lo=theta_m, 
                       null.hi=theta_p)$p.delta
  
  results[i,1] = out_sgpv 
  # TOST p-value = larger of the two one-sided p-values
  results[i,2] = pmax(out_TOST$TOST$p.value[2],out_TOST$TOST$p.value[3]) 
  
  # Classify CI [a, b] vs the zone [theta_m, theta_p]:
  if(theta_m<a & theta_p>a & theta_p<b){
    # Case 4 some overlap (CI extends above the zone)
    results[i,3] = "\n Case 4: \nPartial Overlap"
  }else if(theta_m<b & theta_p>b & theta_m>a){
    # Case 3 some overlap (CI extends below the zone)
    results[i,3] = "\n Case 3: \nPartial Overlap"
  }else if(theta_m>b | theta_p<a){
    # Case 1 no overlap (CI entirely outside the zone)
    results[i,3] = "\n Case 1: \nNo Overlap"
  }else if((theta_m>a & theta_p<b)|(theta_m<=a & theta_p>=b)){
    # Case 2 complete overlap (one contains the other)
    results[i,3] = "\n Case 2: \nComplete Overlap"
  }
}


library(ggplot2)
scaleFUN <- function(x) sprintf("%.1f", x)

# Scatter of SGPV vs TOST equivalence p-value across replications, with
# secondary axes labelling the interpretation regions and letter
# annotations marking regions discussed in the text.
g = ggplot(results, aes(sgpv, tost))+
  xlab("SGPVs") +
  ylab("Equivalence p-values") +
  geom_point(aes(alpha=0.2)) +
  # reference line tost = 1 - sgpv
  geom_abline(intercept = 1, slope = -1,linetype='dashed', alpha=0.6)+
  xlim(c(0,1))+
  scale_x_continuous(breaks= c( 0,0.120001,0.880001,1),labels=c("0", "~0.1", "~0.9", "1"), minor_breaks = c(0.12,0.88),
                      sec.axis=sec_axis(~./1, name="",breaks= c( 0.04,0.51,0.98), labels=c("Consistent \n with Alternative", "\n Inconclusive", "Consistent \n with Null")))+ 
  scale_y_continuous(breaks= c(0,0.07,1),  labels=c("0", expression(alpha), "1"),minor_breaks = c(0.070001),
                     sec.axis=sec_axis(~./1, name="", breaks= c(0.005,0.61), labels=c("Consistent \n with Null","Inconclusive")))+
  annotate(geom="text", x=0, y=0.9, label="D",
              size=5,
           family = "Courier-Bold")+
  annotate(geom="text", x=0.85, y=0.98, label="E",
              size=5,
           family = "Courier-Bold")+
  annotate(geom="text", x=0.91, y=0.98, label="F",
              size=5,
           family = "Courier-Bold")+
  annotate(geom="text", x=0, y=0, label="H",
              size=5,
           family = "Courier-Bold")+
  annotate(geom="text", x=0.15, y=0, label="I",
              size=5,
           family = "Courier-Bold")+
  annotate(geom="text", x=0.91, y=0, label="J",
              size=5,
           family = "Courier-Bold")+
  theme_bw()+
  theme(axis.text.y =  element_text(size=10, hjust=-0.5),
        axis.text.y.right =  element_text(size=10, hjust=0.7),
         axis.text.x.top =  element_text(size=10, vjust=2),
        axis.text.x =  element_text(size=10, vjust=-0.5),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_line(size = 0.8,colour="darkgrey"))+ 
    guides(alpha=FALSE)
g

# Same scatter, now coloured and shaped by the CI/zone overlap case
# assigned in the simulation loop above.
ggplot(results, aes(sgpv, tost, col=Case, pch=Case))+
  geom_point(aes(), cex=1.5) +
  # geom_jitter(width = 0.02, height = 0.02)+
  # ggtitle(paste0("Derived Sample Size= ",n," Iterations= ", iter))+
  xlab("SGPV") +
  ylab("TOST Eq PV") +
  # facet_wrap(~ Case) +
  geom_abline(intercept = 1, slope = -1,linetype='dashed', alpha=0.6)+
  scale_color_manual(values=c("#0000B9",
                              "#00A300",
                              "#FD49A0",
                              "#FFA500",
                              "black"))+
  scale_shape_manual(values=c(16,21,17,24,21))+
    guides(alpha=FALSE)+
  scale_y_continuous(labels=scaleFUN)+ 
  scale_x_continuous(labels=scaleFUN)+
  guides(color = guide_legend(byrow = TRUE))+
   theme_bw()+
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        legend.spacing.y = unit(0.3, 'cm'))

2.5 SGPV Variable Selection

2.5.1 ProSGPV Simulation

  • We first present an example by applying the ProSGPV algorithm to a simulated dataset by use of gen.sim.data function.

  • With sample size n = 100, number of variables p = 10, number of true signals s = 4, smallest effect size \(\beta_{min}\) = 1, largest effect size \(\beta_{max}\) = 5, autoregressive correlation \(\rho\) = 0.2, variance \(\sigma^2\) = 1, and signal-to-noise ratio (SNR) \(\nu\) = 2.

  • We generate outcomes Y following Gaussian distribution. gen.sim.data outputs X, Y, indices of true signals, and a vector of true coefficients.

set.seed(1)
# Simulate data with gen.sim.data() from the ProSGPV package (assumed
# loaded earlier). NOTE(review): the text above says n = 100, but this
# call uses n = 50 -- confirm which is intended.
sim.data <- gen.sim.data(n = 50, p = 10, s = 4,
                         family = "gaussian",
                         beta.min = 1, beta.max = 5,
                         rho = 0.2, nu = 2)
x <- sim.data[[1]]
y <- sim.data[[2]]
(true.index <- sim.data[[3]])   # indices of the true signals
## [1] 2 4 5 7
true.beta <- sim.data[[4]]
  • By default, the two-stage algorithm is used in ProSGPV.
  • pro.sgpv function takes inputs of explanatory variables x, outcome y, outcome type family (default is “gaussian”), stage indicator (default is 2), and a GVIF indicator (default is FALSE).
  • A print method is available to show labels of the variables selected by ProSGPV.
# Run ProSGPV with default settings (family = "gaussian", stage = 2).
sgpv.out.2 <- pro.sgpv(x,y)
# The print method lists the labels of the selected variables.
sgpv.out.2
## Selected variables are V2 V5 V7
# Compare against the true signal indices (V4 was not selected here).
# true.beta
true.index
## [1] 2 4 5 7
  • Visualization of variable selection process
plot(sgpv.out.2)

Model Summary

# S3 summary method: prints the lm() summary of the final model fit on
# the ProSGPV-selected variables (V2, V5, V7).
summary(sgpv.out.2)
## 
## Call:
## lm(formula = Response ~ ., data = data.d)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -13.0068  -2.3711   0.6028   2.7228  10.3547 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.08957    0.71297  -0.126 0.900574    
## V2          -5.44915    0.64621  -8.433 6.87e-11 ***
## V5           3.26839    0.80015   4.085 0.000175 ***
## V7          -3.68785    0.77444  -4.762 1.94e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.895 on 46 degrees of freedom
## Multiple R-squared:  0.7034, Adjusted R-squared:  0.684 
## F-statistic: 36.36 on 3 and 46 DF,  p-value: 3.382e-12
  • The coef function can be used to extract the coefficient estimates of length p. When signals are sparse, some of the estimates are zero. A comparison shows that the estimates are close to the true effect sizes in the simulation.
# Extract the length-p coefficient vector; unselected variables get 0.
beta.hat = coef(sgpv.out.2)
# Stack estimates against the true coefficients for comparison.
rbind(beta.hat, true.beta)
##           [,1]      [,2] [,3] [,4]     [,5] [,6]      [,7] [,8] [,9] [,10]
## beta.hat     0 -5.449154    0    0 3.268389    0 -3.687846    0    0     0
## true.beta    0 -5.000000    0    1 2.333333    0 -3.666667    0    0     0
  • predict function can be used to predict outcomes using the selected model.
  • In-sample prediction can be made by calling predict(sgpv.out.2) and out-of-sample prediction can be made by feeding new data set into the newdata argument.
# In-sample predictions from the selected model (no newdata supplied).
preds = predict(sgpv.out.2)

# Side-by-side comparison of fitted values and observed outcomes.
cbind(preds, y)
##          preds           y
## 1    8.2993645  14.9532559
## 2   -8.8659786 -14.1402243
## 3    8.1576658   5.6770472
## 4   16.7141593  21.2556549
## 5   -6.8915154 -11.9104536
## 6    6.0995442  -6.9072594
## 7   -3.3938972  -6.4016210
## 8    2.6410697  -2.2790190
## 9   -7.8186176  -6.5374117
## 10   4.8723877   3.9731037
## 11 -10.7865487  -9.6668518
## 12   0.6084523  -1.4338933
## 13   3.8841387   5.2537291
## 14  -3.8550714  -2.9899402
## 15   4.7869614   9.6090104
## 16  -1.1651049  -7.4962893
## 17   5.3357619   4.8054879
## 18  -2.2207106  -6.9379824
## 19  -3.1640880   0.2746154
## 20   5.2684414   5.8258520
## 21   9.4316832  -0.1262995
## 22 -15.5982310 -14.6354156
## 23  -9.0735701  -3.7066464
## 24 -14.3862762  -8.9421481
## 25  -7.7604018  -7.1121788
## 26   2.3951003  -2.7577937
## 27  -2.4709820   1.3670330
## 28   3.6316127   3.6091253
## 29  -2.6335952  -3.7277478
## 30  -6.4989520   2.3933283
## 31 -10.2122164 -11.4924695
## 32  -3.5543368  -2.5775804
## 33   5.7377149  10.7925290
## 34   3.2970610   6.2475028
## 35  10.0276588   8.8856192
## 36 -10.2391253 -11.7966930
## 37   4.3474561   6.5661946
## 38  -0.8008003   1.0013047
## 39  -4.0768054  -4.7068644
## 40   0.5462697  10.9009497
## 41  -3.4339914  -2.7473043
## 42  -2.6547422  -6.6247673
## 43   7.5189727   9.6165887
## 44  -5.4459692 -14.7809394
## 45 -12.0955843  -9.2048255
## 46   4.2851927   5.2962132
## 47 -15.1612720 -16.5177728
## 48  -2.5288758  -8.1071853
## 49  -8.7118174  -9.0223750
## 50   5.1952891  14.5666888

In addition to the two-stage algorithm, the one-stage algorithm can also be used to select variables when n > p. The computation time is shorter for the one-stage algorithm, at the expense of a slightly reduced support recovery rate in the limit.

# One-stage ProSGPV (stage = 1): faster, usable when n > p.
sgpv.out.1 <- pro.sgpv(x,y,stage=1)
sgpv.out.1
## Selected variables are V2 V5 V7

The one-stage algorithm missed V4 and only selected three variables. What happened?

plot(sgpv.out.1)

2.5.2 ProSGPV Real Life Example

We can load the Tehran Housing data t.housing stored in the package.

A dataset containing Tehran housing data. The data set has 372 observations. There are 26 explanatory variables at baseline, including 7 project physical and financial features (V2-V8) and 19 economic variables and indices (V11-V29). The outcome (V9) is the sales price of a real estate single-family residential apartment.

set.seed(999)

# prepare the data
# Explanatory variables: all columns except the last; outcome: V9
# (actual sales price), per the dataset description above.
x = t.housing[,-ncol(t.housing)]
y = t.housing$V9

# run ProSGPV
# Two-stage algorithm with default gaussian family.
out.sgpv.2 <- pro.sgpv(x = x, y = y)

The two-stage algorithm selects the following variables.

# Print the labels of the selected variables.
out.sgpv.2
## Selected variables are V8 V12 V13 V15 V17 V26
  • V9: Actual sales price (outcome)
  • V8: Price of the unit at the beginning of the project per square meter
  • V12: Building services index for preselected base year
  • V13: Wholesale price index of building materials for the base year
  • V15: Cumulative liquidity
  • V17: Land price index for the base year
  • V26: CPI of housing, water, fuel & power in the base year

We can view the summary of the final model.

# lm() summary of the final model on the six selected predictors.
summary(out.sgpv.2)
## 
## Call:
## lm(formula = Response ~ ., data = data.d)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1276.35   -75.59    -9.58    59.46  1426.22 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.708e+02  3.471e+01   4.920 1.31e-06 ***
## V8           1.211e+00  1.326e-02  91.277  < 2e-16 ***
## V12         -2.737e+01  2.470e+00 -11.079  < 2e-16 ***
## V13          2.185e+01  2.105e+00  10.381  < 2e-16 ***
## V15          2.041e-03  1.484e-04  13.756  < 2e-16 ***
## V17         -3.459e+00  8.795e-01  -3.934  0.00010 ***
## V26         -4.683e+00  1.780e+00  -2.630  0.00889 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 194.8 on 365 degrees of freedom
## Multiple R-squared:  0.9743, Adjusted R-squared:  0.9739 
## F-statistic:  2310 on 6 and 365 DF,  p-value: < 2.2e-16

Coefficient estimates can be extracted by use of coef.

# Full-length coefficient vector (one entry per predictor column);
# unselected variables are reported as exactly 0.
coef(out.sgpv.2)
##  [1]   0.000000000   0.000000000   0.000000000   0.000000000   0.000000000
##  [6]   0.000000000   1.210755031   0.000000000 -27.367601037  21.853920174
## [11]   0.000000000   0.002040784   0.000000000  -3.459496972   0.000000000
## [16]   0.000000000   0.000000000   0.000000000   0.000000000   0.000000000
## [21]   0.000000000   0.000000000  -4.683172725   0.000000000   0.000000000
## [26]   0.000000000

In-sample prediction can be made using the S3 method predict, and an external sample can be provided to make out-of-sample predictions via the newdata argument of the predict function.

# First six in-sample predicted sale prices.
head(predict(out.sgpv.2))
##         1         2         3         4         5         6 
## 1565.7505 3573.7793  741.7576  212.1297 5966.1682 5724.0172

S3 method plot can be used to visualize the variable selection process.

First, we plot the full solution path with point estimates and 95% confidence intervals. Note that the null region is in grey. The selected variables are colored blue on the y-axis. lambda.max controls the limit of the X-axis.

plot(out.sgpv.2, lambda.max = 0.01)

Alternatively, we can plot the confidence bound that is closer to the null.

plot(out.sgpv.2,lpv=1,lambda.max=0.01)